In [8]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import warnings
import string
In [9]:
import nltk #for working with human language data & text cleaning
In [10]:
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from collections import Counter
from nltk.tokenize import word_tokenize
In [11]:
s_df=pd.read_csv("C:\\Users\\djo16\\Cognifyz\\Dataset-copy(1).csv")
print(s_df.head())
Restaurant ID Restaurant Name Country Code City \
0 6317637 Le Petit Souffle 162 Makati City
1 6304287 Izakaya Kikufuji 162 Makati City
2 6300002 Heat - Edsa Shangri-La 162 Mandaluyong City
3 6318506 Ooma 162 Mandaluyong City
4 6314302 Sambo Kojin 162 Mandaluyong City
Address \
0 Third Floor, Century City Mall, Kalayaan Avenu...
1 Little Tokyo, 2277 Chino Roces Avenue, Legaspi...
2 Edsa Shangri-La, 1 Garden Way, Ortigas, Mandal...
3 Third Floor, Mega Fashion Hall, SM Megamall, O...
4 Third Floor, Mega Atrium, SM Megamall, Ortigas...
Locality \
0 Century City Mall, Poblacion, Makati City
1 Little Tokyo, Legaspi Village, Makati City
2 Edsa Shangri-La, Ortigas, Mandaluyong City
3 SM Megamall, Ortigas, Mandaluyong City
4 SM Megamall, Ortigas, Mandaluyong City
Locality Verbose Longitude Latitude \
0 Century City Mall, Poblacion, Makati City, Mak... 121.027535 14.565443
1 Little Tokyo, Legaspi Village, Makati City, Ma... 121.014101 14.553708
2 Edsa Shangri-La, Ortigas, Mandaluyong City, Ma... 121.056831 14.581404
3 SM Megamall, Ortigas, Mandaluyong City, Mandal... 121.056475 14.585318
4 SM Megamall, Ortigas, Mandaluyong City, Mandal... 121.057508 14.584450
Cuisines ... Currency Has Table booking \
0 French, Japanese, Desserts ... Botswana Pula(P) Yes
1 Japanese ... Botswana Pula(P) Yes
2 Seafood, Asian, Filipino, Indian ... Botswana Pula(P) Yes
3 Japanese, Sushi ... Botswana Pula(P) No
4 Japanese, Korean ... Botswana Pula(P) Yes
Has Online delivery Is delivering now Switch to order menu Price range \
0 No No No 3
1 No No No 3
2 No No No 4
3 No No No 4
4 No No No 4
Aggregate rating Rating color Rating text Votes
0 4.8 Dark Green Excellent 314
1 4.5 Dark Green Excellent 591
2 4.4 Green Very Good 270
3 4.9 Dark Green Excellent 365
4 4.8 Dark Green Excellent 229
[5 rows x 21 columns]
Task 1: Restraunt Reviews¶
Analyze the text reviews to identify the mostcommon positive and negative keywords.
In [11]:
ratings = s_df['Rating text']
ratings
Out[11]:
0 Excellent
1 Excellent
2 Very Good
3 Excellent
4 Excellent
...
9546 Very Good
9547 Very Good
9548 Good
9549 Very Good
9550 Very Good
Name: Rating text, Length: 9551, dtype: object
In [13]:
nltk.download('punkt')
nltk.download('stopwords')
[nltk_data] Downloading package punkt to [nltk_data] C:\Users\djo16\AppData\Roaming\nltk_data... [nltk_data] Package punkt is already up-to-date! [nltk_data] Downloading package stopwords to [nltk_data] C:\Users\djo16\AppData\Roaming\nltk_data... [nltk_data] Package stopwords is already up-to-date!
Out[13]:
True
In [15]:
nltk.download('vader_lexicon')
s_i_a= SentimentIntensityAnalyzer()
stop_words = set(stopwords.words('english'))
[nltk_data] Downloading package vader_lexicon to [nltk_data] C:\Users\djo16\AppData\Roaming\nltk_data... [nltk_data] Package vader_lexicon is already up-to-date!
In [17]:
positive_words =[]
negative_words =[]
In [21]:
for rating_text in ratings:
tokens= word_tokenize(rating_text.lower())
tokens=[token for token in tokens if token.isalpha() and token not in stop_words]
sentiment_score= s_i_a.polarity_scores(rating_text)['compound']
if sentiment_score >= 0.01:
positive_words.extend(tokens)
elif sentiment_score <= 0.01:
negative_words.extend(tokens)
In [23]:
# most common
common_positives= Counter(positive_words)
common_negatives=Counter(negative_words)
In [25]:
num_top_keywords = 10
print('\nTop positive:')
for keyword, count in common_positives.most_common(num_top_keywords):
print(f"{keyword}:{count} times")
Top positive: good:1 times
In [27]:
print("Top negative keywords : ")
for word, count in common_negatives.most_common(num_top_keywords):
print(f"{word}: {count} times")
Top negative keywords :
Calculate the average length of reviews andexplore if there is a relationship betweenreview length and rating.
In [30]:
df_explore = s_df[['Rating text','Aggregate rating']].copy()
df_explore['Review Length'] = df_explore['Rating text'].apply(lambda x: len(str(x)))
avg_review_lengths = df_explore.groupby('Aggregate rating')['Review Length'].mean()
print(avg_review_lengths)
Aggregate rating 0.0 9.0 1.8 4.0 1.9 4.0 2.0 4.0 2.1 4.0 2.2 4.0 2.3 4.0 2.4 4.0 2.5 7.0 2.6 7.0 2.7 7.0 2.8 7.0 2.9 7.0 3.0 7.0 3.1 7.0 3.2 7.0 3.3 7.0 3.4 7.0 3.5 4.0 3.6 4.0 3.7 4.0 3.8 4.0 3.9 4.0 4.0 9.0 4.1 9.0 4.2 9.0 4.3 9.0 4.4 9.0 4.5 9.0 4.6 9.0 4.7 9.0 4.8 9.0 4.9 9.0 Name: Review Length, dtype: float64
In [32]:
plt.figure(figsize=(10,6))
avg_review_lengths.plot(kind='bar',color='purple')
plt.title('Average Review Length of Reviews')
plt.xlabel('Aggregate rating')
plt.ylabel('Average review length character')
plt.show()
Task 2: Votes Analysis¶
Identify the restaurants with the highest andlowest number of votes.
In [36]:
df_v = s_df.dropna(subset=['Votes', 'Restaurant Name'])
df_v= s_df[['Votes', 'Restaurant Name']]
df_v
Out[36]:
| Votes | Restaurant Name | |
|---|---|---|
| 0 | 314 | Le Petit Souffle |
| 1 | 591 | Izakaya Kikufuji |
| 2 | 270 | Heat - Edsa Shangri-La |
| 3 | 365 | Ooma |
| 4 | 229 | Sambo Kojin |
| ... | ... | ... |
| 9546 | 788 | Naml۱ Gurme |
| 9547 | 1034 | Ceviz A��ac۱ |
| 9548 | 661 | Huqqa |
| 9549 | 901 | A���k Kahve |
| 9550 | 591 | Walter's Coffee Roastery |
9551 rows × 2 columns
In [44]:
print("Restaurant(s) with highest votes:")
max_votes = df_v[df_v['Votes'] == df_v['Votes'].max()]
max_votes
Restaurant(s) with highest votes:
Out[44]:
| Votes | Restaurant Name | |
|---|---|---|
| 728 | 10934 | Toit |
In [46]:
print("Restaurant with lowest votes:")
min_votes=df_v.loc[df_v['Votes'].idxmin()]
min_votes
Restaurant with lowest votes:
Out[46]:
Votes 0 Restaurant Name Cantinho da Gula Name: 69, dtype: object
In [ ]:
Analyze if there is a correlation between thenumber of votes and the rating of arestaurant.
In [65]:
print(s_df.columns.tolist())
['Restaurant ID', 'Restaurant Name', 'Country Code', 'City', 'Address', 'Locality', 'Locality Verbose', 'Longitude', 'Latitude', 'Cuisines', 'Average Cost for two', 'Currency', 'Has Table booking', 'Has Online delivery', 'Is delivering now', 'Switch to order menu', 'Price range', 'Aggregate rating', 'Rating color', 'Rating text', 'Votes']
In [63]:
correlation = s_df['Votes'].corr(s_df["Aggregate rating"])
print(f"Correlation between Votes and Rating: {correlation:.2f}")
Correlation between Votes and Rating: 0.31
In [69]:
fig = px.scatter(s_df,
x='Votes',
y='Aggregate rating',
title='Votes vs Aggregate Rating',
labels={'Votes': 'Number of Votes',
'Aggregate rating': 'Aggregate Rating'},
width=1100, height = 800 )
fig.show()
Task 3: Price Range vs Online Delivery & Table Booking¶
Analyze if there is a relationship between the price range and the availability of online delivery and table booking.
In [16]:
s_df.head(5)
Out[16]:
| Restaurant ID | Restaurant Name | Country Code | City | Address | Locality | Locality Verbose | Longitude | Latitude | Cuisines | ... | Currency | Has Table booking | Has Online delivery | Is delivering now | Switch to order menu | Price range | Aggregate rating | Rating color | Rating text | Votes | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 6317637 | Le Petit Souffle | 162 | Makati City | Third Floor, Century City Mall, Kalayaan Avenu... | Century City Mall, Poblacion, Makati City | Century City Mall, Poblacion, Makati City, Mak... | 121.027535 | 14.565443 | French, Japanese, Desserts | ... | Botswana Pula(P) | Yes | No | No | No | 3 | 4.8 | Dark Green | Excellent | 314 |
| 1 | 6304287 | Izakaya Kikufuji | 162 | Makati City | Little Tokyo, 2277 Chino Roces Avenue, Legaspi... | Little Tokyo, Legaspi Village, Makati City | Little Tokyo, Legaspi Village, Makati City, Ma... | 121.014101 | 14.553708 | Japanese | ... | Botswana Pula(P) | Yes | No | No | No | 3 | 4.5 | Dark Green | Excellent | 591 |
| 2 | 6300002 | Heat - Edsa Shangri-La | 162 | Mandaluyong City | Edsa Shangri-La, 1 Garden Way, Ortigas, Mandal... | Edsa Shangri-La, Ortigas, Mandaluyong City | Edsa Shangri-La, Ortigas, Mandaluyong City, Ma... | 121.056831 | 14.581404 | Seafood, Asian, Filipino, Indian | ... | Botswana Pula(P) | Yes | No | No | No | 4 | 4.4 | Green | Very Good | 270 |
| 3 | 6318506 | Ooma | 162 | Mandaluyong City | Third Floor, Mega Fashion Hall, SM Megamall, O... | SM Megamall, Ortigas, Mandaluyong City | SM Megamall, Ortigas, Mandaluyong City, Mandal... | 121.056475 | 14.585318 | Japanese, Sushi | ... | Botswana Pula(P) | No | No | No | No | 4 | 4.9 | Dark Green | Excellent | 365 |
| 4 | 6314302 | Sambo Kojin | 162 | Mandaluyong City | Third Floor, Mega Atrium, SM Megamall, Ortigas... | SM Megamall, Ortigas, Mandaluyong City | SM Megamall, Ortigas, Mandaluyong City, Mandal... | 121.057508 | 14.584450 | Japanese, Korean | ... | Botswana Pula(P) | Yes | No | No | No | 4 | 4.8 | Dark Green | Excellent | 229 |
5 rows × 21 columns
In [18]:
print(s_df['Price range'].unique())
[3 4 2 1]
In [20]:
print(s_df['Has Online delivery'].unique())
print(s_df['Has Table booking'].unique())
['No' 'Yes'] ['Yes' 'No']
In [41]:
s_df['Has Online delivery'] = s_df['Has Online delivery'].map({'Yes': 1, 'No': 0})
s_df['Has Table booking'] = s_df['Has Table booking'].map({'Yes': 1, 'No': 0})
In [43]:
new_table = s_df.groupby('Price range')[['Has Online delivery', 'Has Table booking']].mean()
In [ ]:
new_table = new_table.rename(columns={
'Has Online delivery binary': 'Has Online delivery',
'Has Table booking binary': 'Has Table booking'
})
In [47]:
new_table = new_table.reset_index()
new_table
Out[47]:
| index | Price range | Has Online delivery | Has Table booking | |
|---|---|---|---|---|
| 0 | 0 | 1 | 0.157741 | 0.000225 |
| 1 | 1 | 2 | 0.413106 | 0.076775 |
| 2 | 2 | 3 | 0.291903 | 0.457386 |
| 3 | 3 | 4 | 0.090444 | 0.467577 |
In [49]:
fig = px.bar(new_table,
x='Price range',
y=['Has Online delivery', 'Has Table booking'],
barmode='group',
title='Average Online Delivery and Table Booking by Price Range',
labels={'Price range': 'Price Range',
'Has Online delivery': 'Avg. Online Delivery (1=Yes, 0=No)',
'Has Table booking': 'Avg. Table Booking (1=Yes, 0=No)'})
fig.show()
In [53]:
delivery=pd.crosstab(s_df['Price range'], s_df['Has Online delivery'])
booking = pd.crosstab(s_df['Price range'], s_df['Has Table booking'])
print (delivery ,'\n' , booking)
Has Online delivery 0 1 Price range 1 3743 701 2 1827 1286 3 997 411 4 533 53 Has Table booking 0 1 Price range 1 4443 1 2 2874 239 3 764 644 4 312 274
In [55]:
combined = pd.concat(
[delivery.rename(columns={0: 'No_Online', 1: 'Yes_Online'}),
booking.rename(columns={0: 'No_Booking', 1: 'Yes_Booking'})],
axis=1
)
print("\nCombined Cross Tab:", combined)
Combined Cross Tab: No_Online Yes_Online No_Booking Yes_Booking Price range 1 3743 701 4443 1 2 1827 1286 2874 239 3 997 411 764 644 4 533 53 312 274
In [57]:
combined['Price range'] = ['Low', 'Medium', 'High', 'Very High']
combined = combined[['Price range', 'No_Online', 'Yes_Online', 'No_Booking', 'Yes_Booking']]
In [59]:
combined_1 = pd.melt(
combined,
id_vars='Price range',
value_vars=['No_Online', 'Yes_Online', 'No_Booking', 'Yes_Booking'],
var_name='Service_Status',
value_name='Count'
)
fig = px.bar(
combined_1,
x='Price range',
y='Count',
color='Service_Status',
barmode='group',
title='Availability of Online Delivery and Table Booking by Price Range'
)
fig.show()
Determine if higher-priced restaurants aremore likely to offer these services.
In [62]:
# Plot Online Delivery vs Price Range
sns.barplot(x=new_table.index, y=new_table['Has Online delivery'])
plt.title("Online Delivery by Price Range")
plt.ylabel("Proportion Offering Online Delivery")
plt.xlabel("Price Range")
plt.show()
# Plot Table Booking vs Price Range
sns.barplot(x=new_table.index, y=new_table['Has Table booking'])
plt.title("Table Booking by Price Range")
plt.ylabel("Proportion Offering Table Booking")
plt.xlabel("Price Range")
plt.show()
In [64]:
combined_1 = pd.melt(
combined,
id_vars='Price range',
value_vars=['No_Online', 'Yes_Online', 'No_Booking', 'Yes_Booking'],
var_name='Service_Status',
value_name='Count'
)
fig = px.bar(
combined_1,
x='Price range',
y='Count',
color='Service_Status',
barmode='group',
title='Availability of Online Delivery and Table Booking by Price Range'
)
fig.show()
In [ ]: